├── MANIFEST.in ├── README.md ├── setup.py ├── LICENSE ├── test_pcalg.py └── pcalg.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CPDAG Estimation using PC-Algorithm 2 | 3 | ## Overview 4 | 5 | This package provides functions to estimate a skeleton graph and a 6 | completed partially directed acyclic graph (CPDAG) from a data matrix that 7 | describes the appearance patterns of the graph nodes. The detailed 8 | algorithm is described in [Kalisch2007]. 9 | 10 | ## Code 11 | 12 | The source code is available at https://github.com/keiichishima/pcalg 13 | 14 | ## Bug Reports 15 | 16 | Please submit bug reports or patches through the GitHub interface. 17 | 18 | ## Contributors 19 | 20 | - Satoru Kobayashi, https://github.com/cpflat 21 | - basilnsaeed, https://github.com/basilnsaeed 22 | - Dr. Georg M. Goerg 23 | - limjcst, https://github.com/limjcst 24 | - Alexandre Drouin, https://github.com/aldro61 25 | 26 | ## References 27 | 28 | [Kalisch2007] Markus Kalisch and Peter Bühlmann. Estimating 29 | high-dimensional directed acyclic graphs with the pc-algorithm. In The 30 | Journal of Machine Learning Research, Vol. 8, pp. 613-636, 2007. 31 | 32 | ## Author 33 | 34 | Keiichi SHIMA 35 | / Internet Initiative Japan Inc. 
36 | / WIDE project 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | from setuptools import setup 6 | 7 | try: 8 | from pypandoc import convert 9 | read_md = lambda f: convert(f, 'rst') 10 | except ImportError: 11 | print('pandoc is not installed.') 12 | read_md = lambda f: open(f, 'r').read() 13 | 14 | setup(name='pcalg', 15 | version='0.2.2', 16 | description='CPDAG Estimation using PC-Algorithm', 17 | long_description=read_md('README.md'), 18 | author='Keiichi SHIMA', 19 | author_email='keiichi@iijlab.net', 20 | url='https://github.com/keiichishima/pcalg/', 21 | py_modules=['pcalg'], 22 | install_requires=['networkx>=2.0', 'gsq>=0.1.6'], 23 | classifiers=[ 24 | 'Development Status :: 4 - Beta', 25 | 'Environment :: Console', 26 | 'Intended Audience :: Information Technology', 27 | 'Intended Audience :: Science/Research', 28 | 'License :: OSI Approved :: BSD License', 29 | 'Programming Language :: Python :: 2.7', 30 | 'Topic :: Scientific/Engineering :: Information Analysis', 31 | 'Topic :: Software Development :: Libraries :: Python Modules'], 32 | license='BSD License', 33 | ) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022, Internet Initiative Japan Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. 
Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /test_pcalg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Test suite for pcalg 4 | ''' 5 | import networkx as nx 6 | import numpy as np 7 | 8 | from gsq.ci_tests import ci_test_bin, ci_test_dis 9 | from gsq.gsq_testdata import bin_data, dis_data 10 | 11 | import pytest 12 | 13 | from pcalg import estimate_cpdag 14 | from pcalg import estimate_skeleton 15 | 16 | @pytest.mark.parametrize(('indep_test_func', 'data_matrix', 'g_answer'), [ 17 | (ci_test_bin, np.array(bin_data).reshape((5000, 5)), nx.DiGraph({ 18 | 0: (1, ), 19 | 1: (), 20 | 2: (3, 4), 21 | 3: (1, 2), 22 | 4: (1, 2), 23 | })), 24 | (ci_test_dis, np.array(dis_data).reshape((10000, 5)), nx.DiGraph({ 25 | 0: (2, ), 26 | 1: (2, 3), 27 | 2: (), 28 | 3: (), 29 | 4: (3, ), 30 | })), 31 | ]) 32 | def test_estimate_cpdag(indep_test_func, data_matrix, g_answer, alpha=0.01): 33 | ''' 34 | estimate_cpdag should reveal the answer 35 | ''' 36 | (graph, sep_set) = estimate_skeleton(indep_test_func=indep_test_func, 37 | data_matrix=data_matrix, 38 | alpha=alpha) 39 | graph = estimate_cpdag(skel_graph=graph, sep_set=sep_set) 40 | error_msg = 'True edges should be: %s' % (g_answer.edges(), ) 41 | assert nx.is_isomorphic(graph, g_answer), error_msg 42 | 43 | def test_fixed_edges(): 44 | ''' 45 | The fixed edges shall appear in the skeleton 46 | ''' 47 | data_matrix = np.array(bin_data).reshape((5000, 5)) 48 | (graph, sep_set) = estimate_skeleton(indep_test_func=ci_test_bin, 49 | data_matrix=data_matrix, 50 | alpha=0.01) 51 | graph = estimate_cpdag(skel_graph=graph, sep_set=sep_set) 52 | assert not graph.has_edge(1, 2) 53 | 54 | fixed_edges = nx.DiGraph() 55 | fixed_edges.add_nodes_from(range(5)) 56 | fixed_edges.add_edge(1, 2) 57 | with pytest.raises(ValueError): 58 | _ = estimate_skeleton(indep_test_func=ci_test_bin, 59 | data_matrix=data_matrix, 60 | 
alpha=0.01, 61 | fixed_edges=((1,2), )) 62 | with pytest.raises(ValueError): 63 | _ = estimate_skeleton(indep_test_func=ci_test_bin, 64 | data_matrix=data_matrix, 65 | alpha=0.01, 66 | fixed_edges=nx.DiGraph({0: (1, )})) 67 | (graph, _) = estimate_skeleton(indep_test_func=ci_test_bin, 68 | data_matrix=data_matrix, 69 | alpha=0.01, 70 | fixed_edges=fixed_edges) 71 | assert graph.has_edge(1, 2), graph.edges 72 | -------------------------------------------------------------------------------- /pcalg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """A graph generator based on the PC algorithm [Kalisch2007]. 5 | 6 | [Kalisch2007] Markus Kalisch and Peter Bhlmann. Estimating 7 | high-dimensional directed acyclic graphs with the pc-algorithm. In The 8 | Journal of Machine Learning Research, Vol. 8, pp. 613-636, 2007. 9 | 10 | License: BSD 11 | """ 12 | 13 | from __future__ import print_function 14 | 15 | from itertools import combinations, permutations 16 | import logging 17 | 18 | import networkx as nx 19 | 20 | _logger = logging.getLogger(__name__) 21 | 22 | def _create_complete_graph(node_ids): 23 | """Create a complete graph from the list of node ids. 24 | 25 | Args: 26 | node_ids: a list of node ids 27 | 28 | Returns: 29 | An undirected graph (as a networkx.Graph) 30 | """ 31 | g = nx.Graph() 32 | g.add_nodes_from(node_ids) 33 | for (i, j) in combinations(node_ids, 2): 34 | g.add_edge(i, j) 35 | return g 36 | 37 | def estimate_skeleton(indep_test_func, data_matrix, alpha, **kwargs): 38 | """Estimate a skeleton graph from the statistis information. 39 | 40 | Args: 41 | indep_test_func: the function name for a conditional 42 | independency test. 43 | data_matrix: data (as a numpy array). 44 | alpha: the significance level. 45 | kwargs: 46 | 'max_reach': maximum value of l (see the code). The 47 | value depends on the underlying distribution. 
48 | 'method': if 'stable' given, use stable-PC algorithm 49 | (see [Colombo2014]). 50 | 'init_graph': initial structure of skeleton graph 51 | (as a networkx.Graph). If not specified, 52 | a complete graph is used. 53 | 'fixed_edges': Undirected edges marked here are not changed 54 | (as a networkx.Graph). If not specified, 55 | an empty graph is used. 56 | other parameters may be passed depending on the 57 | indep_test_func()s. 58 | Returns: 59 | g: a skeleton graph (as a networkx.Graph). 60 | sep_set: a separation set (as an 2D-array of set()). 61 | 62 | [Colombo2014] Diego Colombo and Marloes H Maathuis. Order-independent 63 | constraint-based causal structure learning. In The Journal of Machine 64 | Learning Research, Vol. 15, pp. 3741-3782, 2014. 65 | """ 66 | 67 | def method_stable(kwargs): 68 | return ('method' in kwargs) and kwargs['method'] == "stable" 69 | 70 | node_ids = range(data_matrix.shape[1]) 71 | node_size = data_matrix.shape[1] 72 | sep_set = [[set() for i in range(node_size)] for j in range(node_size)] 73 | if 'init_graph' in kwargs: 74 | g = kwargs['init_graph'] 75 | if not isinstance(g, nx.Graph): 76 | raise ValueError 77 | elif not g.number_of_nodes() == len(node_ids): 78 | raise ValueError('init_graph not matching data_matrix shape') 79 | for (i, j) in combinations(node_ids, 2): 80 | if not g.has_edge(i, j): 81 | sep_set[i][j] = None 82 | sep_set[j][i] = None 83 | else: 84 | g = _create_complete_graph(node_ids) 85 | 86 | fixed_edges = set() 87 | if 'fixed_edges' in kwargs: 88 | _fixed_edges = kwargs['fixed_edges'] 89 | if not isinstance(_fixed_edges, nx.Graph): 90 | raise ValueError 91 | if not _fixed_edges.number_of_nodes() == len(node_ids): 92 | raise ValueError('fixed_edges not matching data_matrix shape') 93 | for (i, j) in _fixed_edges.edges: 94 | fixed_edges.add((i, j)) 95 | fixed_edges.add((j, i)) 96 | 97 | l = 0 98 | while True: 99 | cont = False 100 | remove_edges = [] 101 | for (i, j) in permutations(node_ids, 2): 102 | if (i, j) 
in fixed_edges: 103 | continue 104 | 105 | adj_i = list(g.neighbors(i)) 106 | if j not in adj_i: 107 | continue 108 | else: 109 | adj_i.remove(j) 110 | if len(adj_i) >= l: 111 | _logger.debug('testing %s and %s' % (i,j)) 112 | _logger.debug('neighbors of %s are %s' % (i, str(adj_i))) 113 | if len(adj_i) < l: 114 | continue 115 | for k in combinations(adj_i, l): 116 | _logger.debug('indep prob of %s and %s with subset %s' 117 | % (i, j, str(k))) 118 | p_val = indep_test_func(data_matrix, i, j, set(k), 119 | **kwargs) 120 | _logger.debug('p_val is %s' % str(p_val)) 121 | if p_val > alpha: 122 | if g.has_edge(i, j): 123 | _logger.debug('p: remove edge (%s, %s)' % (i, j)) 124 | if method_stable(kwargs): 125 | remove_edges.append((i, j)) 126 | else: 127 | g.remove_edge(i, j) 128 | sep_set[i][j] |= set(k) 129 | sep_set[j][i] |= set(k) 130 | break 131 | cont = True 132 | l += 1 133 | if method_stable(kwargs): 134 | g.remove_edges_from(remove_edges) 135 | if cont is False: 136 | break 137 | if ('max_reach' in kwargs) and (l > kwargs['max_reach']): 138 | break 139 | 140 | return (g, sep_set) 141 | 142 | def estimate_cpdag(skel_graph, sep_set): 143 | """Estimate a CPDAG from the skeleton graph and separation sets 144 | returned by the estimate_skeleton() function. 145 | 146 | Args: 147 | skel_graph: A skeleton graph (an undirected networkx.Graph). 148 | sep_set: An 2D-array of separation set. 149 | The contents look like something like below. 150 | sep_set[i][j] = set([k, l, m]) 151 | 152 | Returns: 153 | An estimated DAG. 
154 | """ 155 | dag = skel_graph.to_directed() 156 | node_ids = skel_graph.nodes() 157 | for (i, j) in combinations(node_ids, 2): 158 | adj_i = set(dag.successors(i)) 159 | if j in adj_i: 160 | continue 161 | adj_j = set(dag.successors(j)) 162 | if i in adj_j: 163 | continue 164 | if sep_set[i][j] is None: 165 | continue 166 | common_k = adj_i & adj_j 167 | for k in common_k: 168 | if k not in sep_set[i][j]: 169 | if dag.has_edge(k, i): 170 | _logger.debug('S: remove edge (%s, %s)' % (k, i)) 171 | dag.remove_edge(k, i) 172 | if dag.has_edge(k, j): 173 | _logger.debug('S: remove edge (%s, %s)' % (k, j)) 174 | dag.remove_edge(k, j) 175 | 176 | def _has_both_edges(dag, i, j): 177 | return dag.has_edge(i, j) and dag.has_edge(j, i) 178 | 179 | def _has_any_edge(dag, i, j): 180 | return dag.has_edge(i, j) or dag.has_edge(j, i) 181 | 182 | def _has_one_edge(dag, i, j): 183 | return ((dag.has_edge(i, j) and (not dag.has_edge(j, i))) or 184 | (not dag.has_edge(i, j)) and dag.has_edge(j, i)) 185 | 186 | def _has_no_edge(dag, i, j): 187 | return (not dag.has_edge(i, j)) and (not dag.has_edge(j, i)) 188 | 189 | # For all the combination of nodes i and j, apply the following 190 | # rules. 191 | old_dag = dag.copy() 192 | while True: 193 | for (i, j) in permutations(node_ids, 2): 194 | # Rule 1: Orient i-j into i->j whenever there is an arrow k->i 195 | # such that k and j are nonadjacent. 196 | # 197 | # Check if i-j. 198 | if _has_both_edges(dag, i, j): 199 | # Look all the predecessors of i. 200 | for k in dag.predecessors(i): 201 | # Skip if there is an arrow i->k. 202 | if dag.has_edge(i, k): 203 | continue 204 | # Skip if k and j are adjacent. 205 | if _has_any_edge(dag, k, j): 206 | continue 207 | # Make i-j into i->j 208 | _logger.debug('R1: remove edge (%s, %s)' % (j, i)) 209 | dag.remove_edge(j, i) 210 | break 211 | 212 | # Rule 2: Orient i-j into i->j whenever there is a chain 213 | # i->k->j. 214 | # 215 | # Check if i-j. 
216 | if _has_both_edges(dag, i, j): 217 | # Find nodes k where k is i->k. 218 | succs_i = set() 219 | for k in dag.successors(i): 220 | if not dag.has_edge(k, i): 221 | succs_i.add(k) 222 | # Find nodes j where j is k->j. 223 | preds_j = set() 224 | for k in dag.predecessors(j): 225 | if not dag.has_edge(j, k): 226 | preds_j.add(k) 227 | # Check if there is any node k where i->k->j. 228 | if len(succs_i & preds_j) > 0: 229 | # Make i-j into i->j 230 | _logger.debug('R2: remove edge (%s, %s)' % (j, i)) 231 | dag.remove_edge(j, i) 232 | 233 | # Rule 3: Orient i-j into i->j whenever there are two chains 234 | # i-k->j and i-l->j such that k and l are nonadjacent. 235 | # 236 | # Check if i-j. 237 | if _has_both_edges(dag, i, j): 238 | # Find nodes k where i-k. 239 | adj_i = set() 240 | for k in dag.successors(i): 241 | if dag.has_edge(k, i): 242 | adj_i.add(k) 243 | # For all the pairs of nodes in adj_i, 244 | for (k, l) in combinations(adj_i, 2): 245 | # Skip if k and l are adjacent. 246 | if _has_any_edge(dag, k, l): 247 | continue 248 | # Skip if not k->j. 249 | if dag.has_edge(j, k) or (not dag.has_edge(k, j)): 250 | continue 251 | # Skip if not l->j. 252 | if dag.has_edge(j, l) or (not dag.has_edge(l, j)): 253 | continue 254 | # Make i-j into i->j. 255 | _logger.debug('R3: remove edge (%s, %s)' % (j, i)) 256 | dag.remove_edge(j, i) 257 | break 258 | 259 | # Rule 4: Orient i-j into i->j whenever there are two chains 260 | # i-k->l and k->l->j such that k and j are nonadjacent. 261 | # 262 | # However, this rule is not necessary when the PC-algorithm 263 | # is used to estimate a DAG. 
264 | 265 | if nx.is_isomorphic(dag, old_dag): 266 | break 267 | old_dag = dag.copy() 268 | 269 | return dag 270 | 271 | if __name__ == '__main__': 272 | import networkx as nx 273 | import numpy as np 274 | 275 | from gsq.ci_tests import ci_test_bin, ci_test_dis 276 | from gsq.gsq_testdata import bin_data, dis_data 277 | 278 | # ch = logging.StreamHandler() 279 | # ch.setLevel(logging.DEBUG) 280 | # _logger.setLevel(logging.DEBUG) 281 | # _logger.addHandler(ch) 282 | 283 | dm = np.array(bin_data).reshape((5000, 5)) 284 | (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_bin, 285 | data_matrix=dm, 286 | alpha=0.01) 287 | g = estimate_cpdag(skel_graph=g, sep_set=sep_set) 288 | g_answer = nx.DiGraph() 289 | g_answer.add_nodes_from([0, 1, 2, 3, 4]) 290 | g_answer.add_edges_from([(0, 1), (2, 3), (3, 2), (3, 1), 291 | (2, 4), (4, 2), (4, 1)]) 292 | print('Edges are:', g.edges(), end='') 293 | if nx.is_isomorphic(g, g_answer): 294 | print(' => GOOD') 295 | else: 296 | print(' => WRONG') 297 | print('True edges should be:', g_answer.edges()) 298 | 299 | dm = np.array(dis_data).reshape((10000, 5)) 300 | (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_dis, 301 | data_matrix=dm, 302 | alpha=0.01, 303 | levels=[3,2,3,4,2]) 304 | g = estimate_cpdag(skel_graph=g, sep_set=sep_set) 305 | g_answer = nx.DiGraph() 306 | g_answer.add_nodes_from([0, 1, 2, 3, 4]) 307 | g_answer.add_edges_from([(0, 2), (1, 2), (1, 3), (4, 3)]) 308 | print('Edges are:', g.edges(), end='') 309 | if nx.is_isomorphic(g, g_answer): 310 | print(' => GOOD') 311 | else: 312 | print(' => WRONG') 313 | print('True edges should be:', g_answer.edges()) 314 | --------------------------------------------------------------------------------